This R Notebook generates figures in Sequencing-based Simulation Analysis section in manuscript.
Inputs:
simulation_seq_based_all_results.rds:
cell type deconvolution results of all methods, as well as the ground
truth in sequencing-based simulation analysis.sim_spatial_spot_loc.csv:
physical locations of spatial spots.Outputs:
[1] "R version 4.2.2 Patched (2022-11-10 r83330)"
[1] "Package ggplot2 version: 3.4.2"
[1] "Package ggpubr version: 0.6.0"
print(sprintf('Package %s version: %s', 'philentropy', packageVersion('philentropy'))) # JSD function[1] "Package philentropy version: 0.7.0"
file_name = file.path(home.dir, 'sim_spatial_spot_loc.csv')
loc_df = read.csv(file_name, row.names = 1, check.names = F)
print(sprintf('load data from %s', file_name))[1] "load data from /home/hill103/Documents/Spatial/Figures/Simulation_seq_based/sim_spatial_spot_loc.csv"
file_name = file.path(home.dir, 'simulation_seq_based_all_results.rds')
all_res = readRDS(file_name)
print(sprintf('load data from %s', file_name))[1] "load data from /home/hill103/Documents/Spatial/Figures/Simulation_seq_based/simulation_seq_based_all_results.rds"
Check the order of spots and cell types are consistent before performance evaluation.
Check whether negative values of estimated cell type proportions
exist, as negative values may cause error in JSD calculation and got
NaN. Replace them as 0.
for (scenario in names(all_res)) {
if (scenario != 'Truth') {
for (method_name in names(all_res[[scenario]])) {
tmp_df = all_res[[scenario]][[method_name]]
for (i in 1:nrow(tmp_df)) {
for (j in 1:ncol(tmp_df)) {
if (tmp_df[i, j] < 0) {
print(sprintf('%s: %s result: row %d (%s) column %d (%s) has negative value %g', scenario, method_name, i, row.names(tmp_df)[i], j, colnames(tmp_df)[j], tmp_df[i, j]))
# replace them with 0
all_res[[scenario]][[method_name]][i, j] = 0
}
}
}
}
}
}[1] "S1_int_ref: SpatialDWLS result: row 125 (x7534y3550) column 4 (eL5) has negative value -1.84573e-19"
[1] "S1_int_ref: SpatialDWLS result: row 143 (x6034y4050) column 4 (eL5) has negative value -2.1962e-18"
[1] "S1_int_ref: SpatialDWLS result: row 291 (x9034y7550) column 2 (eL2/3) has negative value -4.23995e-19"
[1] "S1_int_ref: SpatialDWLS result: row 449 (x5534y11550) column 4 (eL5) has negative value -4.11672e-19"
[1] "S1_ext_ref: SpatialDWLS result: row 1 (x34y50) column 4 (eL5) has negative value -1.96899e-19"
[1] "S1_ext_ref: SpatialDWLS result: row 43 (x5534y1550) column 2 (eL2/3) has negative value -7.23067e-19"
[1] "S1_ext_ref: SpatialDWLS result: row 45 (x6534y1550) column 2 (eL2/3) has negative value -2.65685e-14"
[1] "S1_ext_ref: SpatialDWLS result: row 45 (x6534y1550) column 3 (eL4) has negative value -2.92345e-14"
[1] "S1_ext_ref: SpatialDWLS result: row 51 (x9534y1550) column 2 (eL2/3) has negative value -2.71336e-16"
[1] "S1_ext_ref: SpatialDWLS result: row 51 (x9534y1550) column 3 (eL4) has negative value -1.02778e-15"
[1] "S1_ext_ref: SpatialDWLS result: row 51 (x9534y1550) column 4 (eL5) has negative value -1.07852e-17"
[1] "S1_ext_ref: SpatialDWLS result: row 86 (x9034y2550) column 2 (eL2/3) has negative value -1.19306e-18"
[1] "S1_ext_ref: SpatialDWLS result: row 107 (x9034y3050) column 9 (PVALB) has negative value -3.80156e-19"
[1] "S1_ext_ref: SpatialDWLS result: row 159 (x3034y4550) column 3 (eL4) has negative value -1.70033e-16"
[1] "S1_ext_ref: SpatialDWLS result: row 261 (x4534y7050) column 3 (eL4) has negative value -1.34987e-14"
[1] "S1_ext_ref: SpatialDWLS result: row 261 (x4534y7050) column 4 (eL5) has negative value -1.10174e-17"
[1] "S1_ext_ref: SpatialDWLS result: row 271 (x9534y7050) column 2 (eL2/3) has negative value -4.90217e-19"
[1] "S1_ext_ref: SpatialDWLS result: row 271 (x9534y7050) column 5 (eL6) has negative value -1.03155e-18"
[1] "S1_ext_ref: SpatialDWLS result: row 282 (x4534y7550) column 4 (eL5) has negative value -1.05241e-18"
[1] "S1_ext_ref: SpatialDWLS result: row 293 (x10034y7550) column 4 (eL5) has negative value -5.88417e-18"
[1] "S1_ext_ref: SpatialDWLS result: row 296 (x534y8050) column 5 (eL6) has negative value -1.55533e-20"
[1] "S1_ext_ref: SpatialDWLS result: row 296 (x534y8050) column 11 (SST) has negative value -9.29682e-23"
[1] "S1_ext_ref: SpatialDWLS result: row 297 (x1034y8050) column 2 (eL2/3) has negative value -1.14198e-18"
[1] "S1_ext_ref: SpatialDWLS result: row 297 (x1034y8050) column 11 (SST) has negative value -3.95337e-20"
[1] "S1_ext_ref: SpatialDWLS result: row 308 (x7534y8050) column 5 (eL6) has negative value -1.46301e-18"
[1] "S1_ext_ref: SpatialDWLS result: row 359 (x2034y9550) column 5 (eL6) has negative value -1.05317e-18"
[1] "S1_ext_ref: SpatialDWLS result: row 388 (x6534y10050) column 4 (eL5) has negative value -3.49281e-18"
[1] "S1_ext_ref: SpatialDWLS result: row 425 (x3534y11050) column 2 (eL2/3) has negative value -5.31581e-21"
[1] "S1_ext_ref: SpatialDWLS result: row 440 (x534y11550) column 5 (eL6) has negative value -1.37485e-19"
[1] "S1_ext_ref: SpatialDWLS result: row 463 (x2034y12050) column 3 (eL4) has negative value -6.39067e-16"
[1] "S1_ext_ref: SpatialDWLS result: row 467 (x4534y12050) column 4 (eL5) has negative value -1.49888e-18"
[1] "S1_ext_ref: SpatialDWLS result: row 492 (x7034y12550) column 9 (PVALB) has negative value -8.46644e-20"
[1] "S1_ext_ref: SpatialDWLS result: row 523 (x2534y13550) column 4 (eL5) has negative value -2.521e-18"
[1] "S1_ext_ref: SpatialDWLS result: row 538 (x1034y14050) column 5 (eL6) has negative value -3.46966e-19"
[1] "S1_ext_ref: SpatialDWLS result: row 558 (x3034y14550) column 5 (eL6) has negative value -1.14858e-17"
[1] "S1_ext_ref: SpatialDWLS result: row 565 (x3034y15050) column 4 (eL5) has negative value -5.83219e-19"
Check whether estimated cell type proportions of spot are ALL 0s.
for (scenario in names(all_res)) {
if (scenario != 'Truth') {
for (method_name in names(all_res[[scenario]])) {
tmp_df = all_res[[scenario]][[method_name]]
for (i in 1:nrow(tmp_df)) {
if (sum(tmp_df[i, ]) == 0) {
print(sprintf('%s: %s result: row %d (%s) has ALL 0s', scenario, method_name, i, row.names(tmp_df)[i]))
}
}
}
}
}[1] "S1_ext_ref: SpatialDWLS result: row 2 (x534y50) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 3 (x1034y50) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 4 (x1534y50) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 5 (x2034y50) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 10 (x2534y550) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 11 (x4534y550) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 21 (x1034y1050) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 23 (x2034y1050) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 28 (x5534y1050) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 207 (x8034y5550) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 223 (x6034y6050) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 288 (x7534y7550) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 316 (x534y8550) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 318 (x1534y8550) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 319 (x2034y8550) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 341 (x3034y9050) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 383 (x3534y10050) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 513 (x7534y13050) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 529 (x6034y13550) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 550 (x9534y14050) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 553 (x34y14550) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 561 (x534y15050) has ALL 0s"
[1] "S1_ext_ref: SpatialDWLS result: row 563 (x2034y15050) has ALL 0s"
For the methods without cell type selection procedure, including cell2location, DestVI, CARD and SONAR, we put a hard-thresholding with cutoff 0.1 on the estimated cell type compositions to force the extremely small proportions to be zeros.
UPDATE: hard-thresholding were NOT applied, original estimated cell type proportions were used for performance evaluation.
4 performance measurements:
NOTE:
NA or NaN.binaryPredEvaluation = function(truth, pred) {
# Given an array of truth and predictions (either 0 or 1), calculate confusion matrix
# convert to factors to ensure get a full 2*2 confusion matrix
truth_factor = factor(truth, levels = c(0, 1))
pred_factor = factor(pred, levels = c(0, 1))
# Generate confusion matrix
conf_matrix = table(Actual = truth_factor, Predicted = pred_factor)
# Extract elements of the confusion matrix
TP = conf_matrix[2, 2]
FP = conf_matrix[1, 2]
FN = conf_matrix[2, 1]
TN = conf_matrix[1, 1]
# Calculate FDR (False Discovery Rate)
FDR = FP / (TP + FP)
# Calculate FNR (False Negative Rate)
FNR = FN / (TP + FN)
return(c(FDR, FNR))
}
calcPerformance = function(truth, pred) {
# calculate RMSE, JSD, correlation and FDR for each row
# inputs are matrix with rows as spots and columns as cell types, order has been checked to be consistent
stopifnot(all(row.names(truth) == row.names(pred)))
stopifnot(all(colnames(truth) == colnames(pred)))
# binary cell type proportions (0:absent; 1:present)
truth_binary = truth != 0
pred_binary = pred != 0
# in-place conversion from bool to 0/1 while keeping dimensions, row names and column names, as `as.numeric()` will "flattern" the original matrix
truth_binary[] = as.numeric(truth_binary)
pred_binary[] = as.numeric(pred_binary)
perform_df = data.frame(RMSE=numeric(), JSD=numeric(), Pearson=numeric(), FDR=numeric(), FNR=numeric(), stringsAsFactors = F)
for (i in 1:nrow(truth)) {
RMSE = sqrt(mean((truth[i,] - pred[i,]) ^ 2))
if (sum(pred[i,])>0 & sum(truth[i,])>0) {
JSD = philentropy::JSD(rbind(truth[i,], pred[i,]), unit = 'log2', est.prob = 'empirical')
} else {
JSD = 1
}
Pearson = cor.test(truth[i,], pred[i,])$estimate
tmp = binaryPredEvaluation(truth_binary[i,], pred_binary[i,])
FDR = tmp[1]
FNR = tmp[2]
perform_df[nrow(perform_df)+1, ] = c(RMSE, JSD, Pearson, FDR, FNR)
}
# also record spot names
stopifnot(nrow(perform_df) == nrow(truth))
perform_df['Spot'] = row.names(truth)
return(perform_df)
}
all_perform = list()
for (scenario in names(all_res)) {
if (scenario != 'Truth') {
all_perform[[scenario]] = list()
for (method_name in names(all_res[[scenario]])) {
all_perform[[scenario]][[method_name]] = calcPerformance(as.matrix(all_res[['Truth']]), as.matrix(all_res[[scenario]][[method_name]]))
}
}
}For rows with ALL 0s in SpatialDWLS performance,
replace NA Pearson’s correlation coefficient as 0,
NaN FDR as 0.
Including Table. Median RMSE, JSD, correlation, FDR and FNR of all methods in Scenario 1.
perform_raw_df = data.frame(Scenario=character(), Method=character(), Reference=character(), Spot=character(),
RMSE=numeric(), JSD=numeric(), Pearson=numeric(), FDR=numeric(), FNR=numeric(),
stringsAsFactors = F)
# calculate median performance across all spatial spots for all methods
perform_median_df = data.frame(Scenario=character(), Method=character(), Reference=character(),
median_RMSE=numeric(), median_JSD=numeric(), median_Pearson=numeric(), median_FDR=numeric(), median_FNR=numeric(),
stringsAsFactors = F)
for (scenario in names(all_perform)) {
if (scenario == "S1_int_ref") {
this_scenario = 'Scenario 1'
this_ref = 'Internal'
} else if (scenario == "S1_ext_ref") {
this_scenario = 'Scenario 1'
this_ref = 'External'
} else if (scenario == 'S2_ext_ref') {
this_scenario = 'Scenario 2'
this_ref = 'External'
} else {
this_scenario = 'Scenario 3'
this_ref = 'External'
}
for (method_name in names(all_perform[[scenario]])) {
tmp_df = all_perform[[scenario]][[method_name]]
tmp_df['Scenario'] = this_scenario
tmp_df['Method'] = method_name
tmp_df['Reference'] = this_ref
perform_raw_df = rbind(perform_raw_df, tmp_df[, c('Scenario', 'Method', 'Reference', 'Spot', 'RMSE', 'JSD', 'Pearson', 'FDR', 'FNR')])
# use list() instead of c() to keep data type, otherwise c() will coerce all elements to a common type like string
perform_median_df[nrow(perform_median_df)+1, ] = list(this_scenario, method_name, this_ref,
round(median(tmp_df$RMSE), 3),
round(median(tmp_df$JSD), 3),
round(median(tmp_df$Pearson), 3),
round(median(tmp_df$FDR), 3),
round(median(tmp_df$FNR), 3))
}
}
# set method column as factors
perform_raw_df['Method'] = factor(perform_raw_df$Method, levels = names(method_color))
perform_median_df[, c('Scenario', 'Method', 'Reference', 'median_RMSE', 'median_JSD', 'median_Pearson', 'median_FDR', 'median_FNR')]if (save_file) {
file_name = file.path(home.dir, 'Fig_sequencing_based_performance.pdf')
cairo_pdf(file_name, height=6, width=6.5, onefile=T)
print(sprintf('figures saved in file %s', file_name))
}
plot_df = perform_raw_df %>%
filter(Scenario=='Scenario 1')
g_list = list()
for (perform_ind in c('RMSE', 'Pearson', 'JSD', 'FDR')) {
g_list[[perform_ind]] = ggplot(plot_df, aes(x=Method, y=.data[[perform_ind]], fill=Method)) +
geom_boxplot(position=position_dodge(), outlier.shape=NA) +
scale_fill_manual(values=method_color) +
theme_classic() +
theme(strip.text = element_text(size=10),
axis.text = element_text(color="black"),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
legend.title = element_blank()) +
facet_grid(~Reference)
}
g_list[['Pearson']] = g_list[['Pearson']] + geom_hline(yintercept=0, color="red", linetype="dashed")
ggpubr::ggarrange(plotlist=g_list, ncol=2, nrow=2, common.legend=TRUE, legend="right")ggplot(plot_df, aes(x=Method, y=FNR, fill=Method)) +
geom_boxplot(position=position_dodge(), outlier.shape=NA) +
scale_fill_manual(values=method_color) +
theme_classic() +
theme(strip.text = element_text(size=10),
axis.text = element_text(color="black"),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
legend.title = element_blank()) +
facet_grid(~Reference)if (save_file) {
file_name = file.path(home.dir, 'Fig_sequencing_based_piechart_internal_ref.pdf')
cairo_pdf(file_name, height=21, width=19.5, onefile=T)
print(sprintf('figures saved in file %s', file_name))
}
g_list = list()
all_methods = names(method_color)
all_cts = colnames(all_res$Truth)
for (one_method in all_methods) {
plot_df = merge(all_res$S1_int_ref[[one_method]], loc_df[, c('x', 'y')], by='row.names')
g_list[[length(g_list)+1]] = ggplot() +
geom_scatterpie(aes(x=x, y=y), data=plot_df, cols=all_cts) +
scale_fill_manual(values=my_color) +
theme_classic() +
ggtitle(one_method) +
theme(axis.title=element_blank(), axis.text=element_blank(), axis.ticks=element_blank(), axis.line=element_blank(),
legend.title=element_blank(), legend.text=element_text(size=15),
plot.title=element_text(size=17, face='bold', hjust=0.1))
}
ggpubr::ggarrange(plotlist = g_list, ncol = 3, nrow = 3, align = 'hv', common.legend = T, legend = 'right', font.label = list(size=17))if (save_file) {
file_name = file.path(home.dir, 'Fig_sequencing_based_piechart_external_ref.pdf')
cairo_pdf(file_name, height=21, width=19.5, onefile=T)
print(sprintf('figures saved in file %s', file_name))
}
g_list = list()
all_methods = names(method_color)
all_cts = colnames(all_res$Truth)
for (one_method in all_methods) {
plot_df = merge(all_res$S1_ext_ref[[one_method]], loc_df[, c('x', 'y')], by='row.names')
g_list[[length(g_list)+1]] = ggplot() +
geom_scatterpie(aes(x=x, y=y), data=plot_df, cols=all_cts) +
scale_fill_manual(values=my_color) +
theme_classic() +
ggtitle(one_method) +
theme(axis.title=element_blank(), axis.text=element_blank(), axis.ticks=element_blank(), axis.line=element_blank(),
legend.title=element_blank(), legend.text=element_text(size=15),
plot.title=element_text(size=17, face='bold', hjust=0.1))
}
ggpubr::ggarrange(plotlist = g_list, ncol = 3, nrow = 3, align = 'hv', common.legend = T, legend = 'right', font.label = list(size=17))